# Notebook-style setup cell: imports, inline plotting, and IPython display
# options for a Mashable news clustering lab (K-Means / DBSCAN / Spectral).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.simplefilter('ignore',DeprecationWarning)
import seaborn as sns
import time
import copy
from pylab import rcParams
#import hdbscan
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
#from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn import metrics
from sklearn import metrics as mt
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import confusion_matrix as conf
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.cluster import KMeans
from tabulate import tabulate
# Make every bare expression in a cell display its repr, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# NOTE(review): `from __future__ import ...` must be the FIRST statement of a
# module (and of a notebook cell). Left here it raises SyntaxError when these
# cells are merged into a plain .py file — move it to the top of the file.
from __future__ import print_function
# --- Load the cleaned Mashable dataset and log-transform the LDA topic columns ---
data_dir = '../data/'
data_file = 'mashable_clean_dataset_for_lab_02_task_02.csv'
file_2_read = data_dir + data_file

df = pd.read_csv(file_2_read)

# Work on a deep copy so the raw frame stays untouched downstream.
df_cluster = copy.deepcopy(df)
del df_cluster['data_channel']  # categorical column; clustering uses numeric features only

# log(1 + x) compresses the heavily right-skewed LDA topic weights.
# np.log1p(x) computes log(x + 1) with better accuracy for small x
# than the original np.log(x + 1).
for column in ['LDA_00', 'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04']:
    new_col_name = 'ln_' + column
    print(new_col_name)
    df_cluster[new_col_name] = np.log1p(df_cluster[column])

col_names = df_cluster.columns.values.tolist()
col_names                 # notebook display of the column list
df_cluster.describe().T   # notebook display of summary statistics
from matplotlib import pyplot as plt
plt.style.use("ggplot")
%matplotlib inline
X1 = df_cluster[['ln_LDA_00','ln_LDA_01', 'ln_LDA_02', 'ln_LDA_03', 'ln_LDA_04']].values
plt.figure(figsize = (12,12))
plt.subplot(221)
plt.scatter(X1[:, 1], X1[:, 0],
s = 20,
alpha = 0.10)
plt.xlabel('LDA_00'), plt.ylabel('LDA_01')
plt.grid()
plt.title('LDA_00 vs. LDA_01')
plt.subplot(222)
plt.scatter(X1[:, 2], X1[:, 0],
s = 20,
alpha = 0.10)
plt.xlabel('LDA_00'), plt.ylabel('LDA_02')
plt.grid()
plt.title('LDA_00 vs. LDA_01')
plt.subplot(223)
plt.scatter(X1[:, 3], X1[:, 0],
s = 20,
alpha = 0.10)
plt.xlabel('LDA_00'), plt.ylabel('LDA_03')
plt.grid()
plt.title('LDA_00 vs. LDA_01')
plt.subplot(224)
plt.scatter(X1[:, 4], X1[:, 0],
s = 20,
alpha = 0.01)
plt.xlabel('LDA_00'), plt.ylabel('LDA_04')
plt.grid()
plt.title('LDA_00 vs. LDA_01')
plt.show();
# K-Means scan over k = 2..11 on the log-LDA topic features.
# For each k: print inertia and draw a 2x2 grid of cluster-colored scatters
# (LDA_00 vs. each other topic) with the centroids overlaid as triangles.
lda_cols = ['ln_LDA_00', 'ln_LDA_01', 'ln_LDA_02', 'ln_LDA_03', 'ln_LDA_04']
for n_lda in range(2, 12):
    X1 = df_cluster[lda_cols]
    cls_lda = KMeans(n_clusters=n_lda,
                     init='k-means++',
                     random_state=1)
    cls_lda.fit(X1)
    kmeans_labels = cls_lda.labels_            # cluster assignment per row
    kmeans_centers = cls_lda.cluster_centers_  # (n_lda, 5) centroid coordinates
    kmeans_inertia = cls_lda.inertia_          # within-cluster sum of squares
    print("n_lda = ", n_lda)
    print("inertia = ", kmeans_inertia)

    plt.figure(figsize=(12, 12))
    X1 = X1.values
    # One panel per (LDA_00, LDA_0k) pair — replaces four copy-pasted blocks.
    for plot_idx, col in enumerate([1, 2, 3, 4], start=1):
        plt.subplot(2, 2, plot_idx)
        plt.scatter(X1[:, 0], X1[:, col],
                    c=kmeans_labels,
                    cmap=plt.cm.rainbow,
                    s=50,
                    linewidths=0,
                    alpha=0.05)
        plt.scatter(kmeans_centers[:, 0], kmeans_centers[:, col],
                    c=range(n_lda),
                    cmap=plt.cm.rainbow,
                    s=400,
                    linewidths=1.0,
                    marker='^',
                    edgecolors='black',
                    alpha=0.90)
        if plot_idx == 1:
            # Inertia annotation appeared on the first panel only in the original.
            plt.text(0.8, 0.8, kmeans_inertia)
        plt.xlabel('LDA_00'), plt.ylabel('LDA_0%d' % col)
        plt.grid()
    plt.show();
# K-Means scan over k = 2..9 on the log media-count features
# (images, videos, hyperlinks). For each k: print inertia and draw the three
# pairwise cluster-colored scatters with centroids overlaid.
media_cols = ['ln_num_imgs', 'ln_num_videos', 'ln_num_hrefs']
axis_names = ['images', 'videos', 'hrefs']
for n_lda in range(2, 10):
    X1 = df_cluster[media_cols]
    cls_lda = KMeans(n_clusters=n_lda,
                     init='k-means++',
                     random_state=1)
    cls_lda.fit(X1)
    kmeans_labels = cls_lda.labels_            # cluster assignment per row
    kmeans_centers = cls_lda.cluster_centers_  # (n_lda, 3) centroid coordinates
    kmeans_inertia = cls_lda.inertia_          # within-cluster sum of squares
    print("n_lda = ", n_lda)
    print("inertia = ", kmeans_inertia)

    plt.figure(figsize=(16, 8))
    X1 = X1.values
    # One panel per feature pair — replaces three copy-pasted blocks.
    for plot_idx, (i, j) in enumerate([(0, 1), (0, 2), (1, 2)], start=1):
        plt.subplot(1, 3, plot_idx)
        plt.scatter(X1[:, i], X1[:, j],
                    c=kmeans_labels,
                    cmap=plt.cm.rainbow,
                    s=50,
                    linewidths=0,
                    alpha=0.05)
        plt.scatter(kmeans_centers[:, i], kmeans_centers[:, j],
                    c=range(n_lda),
                    cmap=plt.cm.rainbow,
                    s=400,
                    linewidths=1.0,
                    marker='^',
                    edgecolors='black',
                    alpha=0.90)
        if plot_idx == 1:
            # Inertia annotation appeared on the first panel only in the original.
            plt.text(0.8, 0.8, kmeans_inertia)
        plt.xlabel(axis_names[i]), plt.ylabel(axis_names[j])
        plt.grid()
    plt.show();
# Elbow scan: fit K-Means on the full feature frame for every k in 2..49 and
# print the inertia (within-cluster sum of squares) so the knee can be eyeballed.
X1 = df_cluster
for n_lda in range(2, 50):
    cls_lda = KMeans(n_clusters=n_lda,
                     init='k-means++',
                     random_state=1)
    cls_lda.fit(X1)
    kmeans_labels = cls_lda.labels_            # cluster assignment per row
    kmeans_centers = cls_lda.cluster_centers_  # centroid coordinates
    kmeans_inertia = cls_lda.inertia_          # within-cluster sum of squares
    print("n_lda, inertia ", n_lda, kmeans_inertia)
http://hdbscan.readthedocs.io/en/latest/comparing_clustering_algorithms.html
DBSCAN is a density based algorithm – it assumes clusters for dense regions. It is also the first actual clustering algorithm we’ve looked at: it doesn’t require that every point be assigned to a cluster and hence doesn’t partition the data, but instead extracts the ‘dense’ clusters and leaves sparse background classified as ‘noise’.
In practice DBSCAN is related to agglomerative clustering.
As a first step DBSCAN transforms the space according to the density of the data: points in dense regions are left alone, while points in sparse regions are moved further away. Applying single linkage clustering to the transformed space results in a dendrogram, which we cut according to a distance parameter (called epsilon or eps in many implementations) to get clusters. Importantly any singleton clusters at that cut level are deemed to be ‘noise’ and left unclustered. This provides several advantages: we get the manifold following behaviour of agglomerative clustering, and we get actual clustering as opposed to partitioning. Better yet, since we can frame the algorithm in terms of local region queries we can use various tricks such as kdtrees to get exceptionally good performance and scale to dataset sizes that are otherwise unapproachable with algorithms other than K-Means.
There are some catches however. Obviously epsilon can be hard to pick; you can do some data analysis and get a good guess, but the algorithm can be quite sensitive to the choice of the parameter. The density based transformation depends on another parameter (min_samples in sklearn).
Finally the combination of min_samples and eps amounts to a choice of density and the clustering only finds clusters at or above that density; if your data has variable density clusters then DBSCAN is either going to miss them, split them up, or lump some of them together depending on your parameter choices.
So, in summary: DBSCAN removes the need to choose the number of clusters, but replaces it with the need to choose eps and min_samples, and it assumes a single density threshold across the whole dataset — variable-density clusters may be missed, split, or merged.
So how does it cluster our test dataset? I played with a few epsilon values until I got something reasonable, but there was little science to this – getting the parameters right can be hard.
%%time
from sklearn.cluster import DBSCAN
params = []
for eps in [0.005, 0.0075, 0.010, 0.020, 0.05, 0.10]:
for min_pts in range (20, 200, 20):
X1 = df[['ln_LDA_00','ln_LDA_01', 'ln_LDA_02', 'ln_LDA_03', 'ln_LDA_04']]
# append on the clustering
cls_fam = DBSCAN(eps = eps, min_samples = min_pts)
cls_fam.fit(X1)
X1.describe().T
newfeature_dbscan = cls_fam.labels_ # the labels from kmeans clustering
print ("eps, min_pts = ", eps, min_pts)
plt.figure(figsize=(12, 12))
plt.subplot(221)
X1 = X1.values
plt.scatter(X1[:, 0], X1[:, 1],
c = newfeature_dbscan,
cmap = plt.cm.rainbow,
s = 50,
linewidths = 0,
alpha = 0.05)
plt.xlabel('LDA_00'), plt.ylabel('LDA_01')
plt.grid()
plt.subplot(222)
plt.scatter(X1[:, 0], X1[:, 2],
c = newfeature_dbscan,
cmap = plt.cm.rainbow,
s = 50,
linewidths = 0,
alpha = 0.05)
plt.xlabel('LDA_00'), plt.ylabel('LDA_02')
plt.grid()
plt.subplot(223)
plt.scatter(X1[:, 0], X1[:, 3],
c = newfeature_dbscan,
cmap = plt.cm.rainbow,
s = 50,
linewidths = 0,
alpha = 0.05)
plt.xlabel('LDA_00'), plt.ylabel('LDA_03')
plt.grid()
plt.subplot(224)
plt.scatter(X1[:, 0], X1[:, 4],
c = newfeature_dbscan,
cmap = plt.cm.rainbow,
s = 50,
linewidths = 0,
alpha = 0.05)
plt.xlabel('LDA_00'), plt.ylabel('LDA_04')
plt.grid()
plt.show();
# y = df_imputed['Survived']
# X = df_imputed[['IsMale','Pclass','Fare']]
# X = np.column_stack((X,pd.get_dummies(newfeature_fam)))
# acc = cross_val_score(clf,X,y=y,cv=cv)
# params.append((n_fare,n_fam,acc.mean()*100,acc.std()*100)) # save state
# print (eps,mpts,"Average accuracy = ", acc.mean()*100, "+-", acc.std()*100)
from sklearn.cluster import SpectralClustering

# Spectral clustering on the log-LDA topic features, then the same
# 2x2 cluster-colored scatter grid as above.
# BUG FIX: the ln_LDA_* columns were created on df_cluster, not on df —
# indexing df here raised a KeyError.
X1 = df_cluster[['ln_LDA_00', 'ln_LDA_01', 'ln_LDA_02', 'ln_LDA_03', 'ln_LDA_04']]
nclust = 6
# affinity may be 'nearest_neighbors', 'precomputed', 'rbf',
# or one of the kernels supported by sklearn.metrics.pairwise_kernels
spc = SpectralClustering(n_clusters=nclust, affinity='nearest_neighbors')
labels = spc.fit_predict(X1)

plt.figure(figsize=(12, 12))
X1 = X1.values
# One panel per (LDA_00, LDA_0k) pair — replaces four copy-pasted blocks.
for plot_idx, col in enumerate([1, 2, 3, 4], start=1):
    plt.subplot(2, 2, plot_idx)
    plt.scatter(X1[:, 0], X1[:, col],
                c=labels,
                cmap=plt.cm.rainbow,
                s=50,
                linewidths=0,
                alpha=0.05)
    plt.xlabel('LDA_00'), plt.ylabel('LDA_0%d' % col)
    plt.grid()
plt.show();